3  Results

3.1 Our Data Story

Welcome to our culinary journey through nutritional data! Using data from the USDA, we’re diving deep into the nutritional profiles of various food categories like bread, meat, vegetables, and beverages. Imagine walking through a grocery store or preparing a meal—we’ll take you on a journey through the foods we often enjoy, starting with appetizers like bread, moving on to hearty main courses of meat, pairing it with fresh vegetables, and finally topping it all off with a refreshing beverage. This project isn’t just about numbers—it’s about exploring the stories behind the foods that nourish us.

Code
suppressPackageStartupMessages(library(tidyverse, quietly = TRUE))
suppressPackageStartupMessages(library(dplyr, quietly = TRUE))
suppressPackageStartupMessages(library(stringr, quietly = TRUE))
suppressPackageStartupMessages(library(redav, quietly = TRUE))
suppressPackageStartupMessages(library(GGally, quietly = TRUE))
# Load the branded-food nutrient table and parse publishedDate with the
# month/day/year format so rows can be ordered by date.
data <- read_csv("Branded Nutrients_Dataset.csv", show_col_types = FALSE) |>
  mutate(publishedDate = as.Date(publishedDate, format = "%m/%d/%Y"))

# One row per product (description + category + owner + brand): keep the row
# with the most recent publishedDate; with_ties = FALSE keeps a single row
# when several share that date.
latest_data <- data |>
  group_by(description, foodCategory, brandOwner, brandName) |>
  slice_max(order_by = publishedDate, n = 1, with_ties = FALSE) |>
  ungroup()

3.2 Breads

Let’s dive into the fascinating world of bread through our data visualization journey! We begin by exploring the overall nutritional differences across various types of bread. In the first plot, we observe a clear separation of breads based on nutritional attributes such as protein, fat, sugars, and sodium. Notably, whole wheat sandwich rolls stand out with significantly higher values for specific nutrients compared to the others, suggesting that they pack more in terms of energy and macronutrients. This chart sets the stage for understanding how widely bread types can vary in their health benefits, inviting us to dive deeper into specific categories.

Code
# redav is installed from GitHub: remotes::install_github("jtr13/redav")
# Keep only "Breads & Buns" products, restricted to the identifying columns,
# the package weight and the six nutrient columns used below. Rows with a
# missing value in any of these columns are dropped.
bread_data <- latest_data |>
  filter(foodCategory %in% "Breads & Buns") |>
  select(all_of(c(
    "description", "foodCategory", "brandOwner", "brandName",
    "packageWeight",
    "Protein G",
    "Total lipid (fat) G",
    "Carbohydrate, by difference G",
    "Energy KCAL",
    "Total Sugars G",
    "Sodium, Na MG"
  ))) |>
  drop_na()

# Extract numeric package sizes from the free-text `packageWeight` column.
#
# data:      data frame with a character column `packageWeight`.
# food_type: "solid" or "liquid"; selects which units are searched for.
#
# Returns `data` with one extra numeric column per unit, added in this order:
#   solid  -> lbs, g, oz, kg
#   liquid -> gal, l, ml, fl_oz, qt
# Each column holds the first number in `packageWeight` that is followed by
# that unit, or NA when the unit does not occur. Stops with an error when
# `food_type` is missing or is not one of the two accepted values.
parse_packageWeight <- function(data, food_type) {
  if (missing(food_type)) {
    stop("The parameter 'food_type' must be specified as 'solid' or 'liquid'.")
  }
  if (!food_type %in% c("solid", "liquid")) {
    stop("Invalid 'food_type'. Please use 'solid' or 'liquid'.")
  }

  # One regex per unit: a number (optionally with decimals), optional
  # whitespace, then the case-insensitive unit spelling.
  if (food_type == "solid") {
    unit_rx <- c(
      lbs = "(\\d+(\\.\\d+)?)\\s*[lL][bB][sS]?",
      g = "(\\d+(\\.\\d+)?)\\s*[gG]",
      oz = "(\\d+(\\.\\d+)?)\\s*[oO][zZ]",
      kg = "(\\d+(\\.\\d+)?)\\s*[kK][gG]"
    )
  } else {
    unit_rx <- c(
      gal = "(\\d+(\\.\\d+)?)\\s*[gG][aA][lL]",
      # The trailing \b requires the unit to end at the "l"/"L".
      l = "(\\d+(\\.\\d+)?)\\s*[lL]\\b",
      ml = "(\\d+(\\.\\d+)?)\\s*[mM][lL]",
      fl_oz = "(\\d+(\\.\\d+)?)\\s*[fF][lL]\\s*[oO][zZ]",
      # Matches "QT" / "QTS" in any letter case.
      qt = "(\\d+(\\.\\d+)?)\\s*[qQ][tT][sS]?"
    )
  }

  for (unit_name in names(unit_rx)) {
    # First pull out "<number><unit>", then keep only the number.
    hit <- stringr::str_extract(data$packageWeight, unit_rx[[unit_name]])
    data[[unit_name]] <- as.numeric(
      stringr::str_extract(hit, "\\d+(\\.\\d+)?")
    )
  }

  return(data)
}

# Collapse the per-unit columns produced by parse_packageWeight() into a
# single package-size column and drop rows where no size could be parsed.
#
# data:      data frame with a character column `packageWeight`.
# food_type: "solid" or "liquid"; any other value (or a missing argument)
#            raises an error.
#
# Returns `data` with
#   solid  -> `packageWeight_g`  (grams); helper columns lbs/oz/kg removed
#   liquid -> `packageWeight_ml` (millilitres); helper columns
#             l/fl_oz/qt/gal removed
# When several units were parsed from the same string, the first non-missing
# unit in the case_when() order below wins; the plain g / ml value is only
# used as the fallback.
fill_missing <- function(data, food_type) {
  if (missing(food_type)) {
    stop("The parameter 'food_type' must be specified as 'solid' or 'liquid'.")
  }
  if (!food_type %in% c("solid", "liquid")) {
    stop("Invalid 'food_type'. Please use 'solid' or 'liquid'.")
  }

  parsed <- parse_packageWeight(data, food_type)

  if (food_type == "solid") {
    data <- parsed |>
      mutate(
        g = case_when(
          !is.na(lbs) ~ lbs * 454,
          !is.na(oz) ~ floor(oz * 28.3495231),
          !is.na(kg) ~ kg * 1000,
          TRUE ~ g
        )
      ) |>
      filter(!is.na(g)) |>
      select(-lbs, -oz, -kg) |>
      rename(packageWeight_g = g)
  } else {
    data <- parsed |>
      mutate(
        ml = case_when(
          !is.na(l) ~ l * 1000,
          !is.na(fl_oz) ~ floor(fl_oz * 29.5735),
          # One multiplication only: multiplying by `qt` a second time would
          # square the quart count and inflate every size other than 1 qt.
          !is.na(qt) ~ floor(qt * 946.352946),
          !is.na(gal) ~ floor(gal * 3785.411784),
          TRUE ~ ml
        )
      ) |>
      filter(!is.na(ml)) |>
      select(-l, -fl_oz, -qt, -gal) |>
      rename(packageWeight_ml = ml)
  }

  return(data)
}

# Divide every numeric column by (package size / 100).
#
# data:      data frame with a character column `packageWeight` plus the
#            numeric columns to rescale.
# food_type: "solid" (size in grams) or "liquid" (size in millilitres);
#            validated by fill_missing(), which stops on any other value.
#
# Returns `data` without the rows whose package size could not be parsed,
# with every numeric column divided by size / 100, and with the
# `packageWeight` and `packageWeight_g` / `packageWeight_ml` columns removed.
# NOTE(review): this presumably converts per-package amounts to per-100 g /
# per-100 ml values - confirm against the units used in the dataset.
align_weight_scale <- function(data, food_type) {
  # Step 1: parse the package size and convert it to a single unit.
  data <- fill_missing(data, food_type)

  # Step 2: pick the size column that fill_missing() created for this type.
  weight_column <- if (food_type == "solid") {
    "packageWeight_g"
  } else {
    "packageWeight_ml"
  }

  if (!weight_column %in% colnames(data)) {
    stop(paste("Column", weight_column, "is missing in the data."))
  }

  # Kept as a local vector so no temporary column is added to `data`.
  ratio <- data[[weight_column]] / 100

  is_numeric_col <- vapply(data, is.numeric, logical(1))
  numeric_cols <- setdiff(
    names(data)[is_numeric_col],
    c(weight_column, "packageWeight")
  )

  # `ratio` has one entry per row, so each selected column is divided
  # element-wise by the size ratio of its own row.
  data[numeric_cols] <- data[numeric_cols] / ratio

  data <- data |>
    select(-all_of(weight_column), -packageWeight)

  return(data)
}

# Rescale the bread nutrients by package weight, then shorten four long
# product descriptions so their labels stay readable on the biplot. Every
# other description is left untouched.
bread_data <- align_weight_scale(bread_data, "solid") |>
  mutate(
    description = case_when(
      description %in% "100% WHOLE WHEAT SANDWICH THINS ROLLS, 100% WHOLE WHEAT" ~
        "WHOLE WHEAT SANDWICH THINS ROLLS",
      description %in% "2 BRIOCHE CHOCOLATE POPOVERS, 2 BRIOCHE" ~
        "BRIOCHE CHOCOLATE POPOVERS",
      description %in% "7 GRAIN OLIVE OIL CROSTINI, 7 GRAIN OLIVE OIL" ~
        "GRAIN OLIVE OIL CROSTINI",
      description %in% "8 GRAIN ORGANIC GRAIN HEARTY WHEAT FLATBREAD, 8 GRAIN" ~
        "GRAIN HEARTY WHEAT FLATBREAD",
      TRUE ~ description
    )
  )

# Warnings are switched off globally here (warn = -1) in addition to the
# suppressWarnings() wrapper around the call.
options(warn = -1)
suppressWarnings(
  draw_biplot(
    bread_data,
    point_labels = TRUE,
    point_size = 2,
    label_size = 3,
    fix_sign = TRUE
  )
)

Code
# Turn warning reporting back on after it was silenced (warn = -1) for the
# bread biplot.
options(warn = 0)

Bagels Biplot

Next, we narrow our focus to bagels from different brands. The second plot reveals interesting insights into how the nutritional profiles of bagels can differ dramatically depending on the brand. Some brands, like Wake-N-Bagel, are high in total lipids and sugars, while others, such as Kroger, show more balanced profiles. This variation highlights the importance of brand selection when choosing bagels for a meal, as their nutritional contributions to our diet can vary significantly.

Code
# Keep only bread products whose description mentions "bagel"
# (case-insensitive).
bagel_data <- bread_data |>
  filter(str_detect(description, regex("bagel", ignore_case = TRUE)))

# Drop the other text columns so brandName is the only non-numeric column
# passed to the biplot (presumably used for the point labels - confirm in the
# redav documentation). The call no longer ends with a stray comma, which
# passed an extra empty argument.
draw_biplot(
  bagel_data |> select(-description, -foodCategory, -brandOwner),
  label_size = 5,
  fix_sign = TRUE
)

Wheat Bread Biplot

Lastly, our tour of the bread aisle concludes with an in-depth look at wheat bread across brands. The third plot shows how brands like Simply Roundys stand apart with exceptionally high carbohydrate content, while others, such as Angelic Bakehouse, maintain a balanced nutritional profile. This comparison emphasizes how even within the same bread category, brands play a crucial role in defining the nutritional quality. Whether you are shopping for a hearty slice of wheat bread or a light and nutritious option, these insights offer valuable guidance in making informed choices at the grocery store.

Code
# Silence warnings while the wheat-bread biplot is drawn.
options(warn = -1)

# Keep only bread products whose description mentions "wheat"
# (case-insensitive).
wheat_bread_data <- bread_data |>
  filter(str_detect(description, regex("wheat", ignore_case = TRUE)))

# Drop the other text columns so brandName is the only non-numeric column
# passed to the biplot. The call no longer ends with a stray comma, which
# passed an extra empty argument.
draw_biplot(
  wheat_bread_data |> select(-description, -foodCategory, -brandOwner),
  label_size = 3,
  fix_sign = TRUE
)

Code
# Turn warning reporting back on after it was silenced (warn = -1) for the
# wheat-bread biplot.
options(warn = 0)

3.3 Meat

Now, let’s dive into the heart of the meal—meat. In the first plot, we explore the nutritional composition of three groups: fish, poultry (chicken & turkey), and processed pork. The visual comparison reveals that fish consistently scores high in protein content while keeping fat levels moderate, making it a popular choice for health-conscious eaters. Poultry tends to cluster lower in fat and cholesterol compared to processed pork, which stands out with higher sodium levels—a likely result of curing and preservation processes. This contrast offers a clear distinction between the natural and processed meat options.

Code
# Fish: the package weight is taken from the description text (digits
# followed by OZ or LB); products without such a token are dropped.
fish_origin <- latest_data |>
  filter(foodCategory %in% "Fish  Unprepared/Unprocessed") |>
  mutate(
    packageWeight = str_extract(description, "\\d+\\s?(OZ|LB)"),
    Meat_Type = "Fish"
  ) |>
  filter(!is.na(packageWeight))

# Poultry: uses the packageWeight column as provided; rows without one are
# dropped.
chicken_origin <- latest_data |>
  filter(
    foodCategory %in% "Poultry, Chicken & Turkey",
    !is.na(packageWeight)
  ) |>
  mutate(Meat_Type = "Poultry, Chicken & Turkey")

# Processed pork: prepared/processed meat whose description matches the
# pattern below. The weight comes from the description: "<digits>#" is read
# as pounds, otherwise a "<digits> oz."/"<digits> lb." token is used.
# NOTE(review): "BACN" looks like a typo for "BACON" - confirm against the
# descriptions in the data before changing it.
# NOTE(review): in "(oz.|lb.)" the "." is an unescaped regex wildcard and the
# match is case-sensitive - confirm this is intended.
Processed_Pork_origin <- latest_data |>
  filter(
    foodCategory %in% "Meat/Poultry/Other Animals  Prepared/Processed",
    grepl("BACN|HAM|Sausage", description, ignore.case = TRUE)
  ) |>
  mutate(
    packageWeight = coalesce(
      str_replace(str_extract(description, "\\d+#"), "#", " lbs"),
      str_extract(description, "\\d+\\s?(oz.|lb.)")
    ),
    Meat_Type = "Processed_Pork"
  ) |>
  filter(!is.na(packageWeight))

# Beef and pork: "Other Meats" products mentioning PORK or BEEF; a product
# mentioning both is labelled "Beef" because that test comes first.
beef_Pork_origin <- latest_data |>
  filter(
    foodCategory %in% "Other Meats",
    grepl("PORK|BEEF", description, ignore.case = TRUE),
    !is.na(packageWeight)
  ) |>
  mutate(
    Meat_Type = case_when(
      grepl("BEEF", description, ignore.case = TRUE) ~ "Beef",
      grepl("PORK", description, ignore.case = TRUE) ~ "Pork",
      TRUE ~ NA_character_
    )
  )

# Stack fish, poultry and processed pork, keeping the description, the
# package weight, seven nutrient columns and the group label; incomplete
# rows are dropped.
data_list1 <- list(fish_origin, chicken_origin, Processed_Pork_origin)

combined_meat_data_1 <- bind_rows(data_list1) |>
  select(all_of(c(
    "description",
    "packageWeight",
    "Protein G",
    "Total lipid (fat) G",
    "Energy KCAL",
    "Calcium, Ca MG",
    "Iron, Fe MG",
    "Sodium, Na MG",
    "Cholesterol MG",
    "Meat_Type"
  ))) |>
  drop_na()

data_cleaned_1 <- align_weight_scale(combined_meat_data_1, food_type = "solid")

# Keep rows with sodium <= 500 and fat <= 100 after rescaling.
data_filtered_1 <- data_cleaned_1 |>
  filter(
    `Sodium, Na MG` <= 500,
    `Total lipid (fat) G` <= 100
  )

# packageWeight is removed by align_weight_scale(), so column 1 is the
# description, columns 2-8 are the nutrients and column 9 is Meat_Type.
ggparcoord(
  data = data_filtered_1,
  columns = 2:8,        # nutrient variable columns
  groupColumn = 9,      # group (colour) the lines by Meat_Type
  alpha = 0.5,          # line transparency
  scale = "uniminmax",  # rescale each variable before plotting
  title = "Parallel Coordinate Plot with Fish, Chicken and Processed Pork"
) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 18, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 18),
    axis.title.x = element_text(size = 18),
    axis.title.y = element_text(size = 18),
    legend.title = element_text(size = 18),
    legend.text = element_text(size = 18),
    plot.title = element_text(size = 20)
  ) +
  labs(x = "Nutritional Variables", y = "Scaled Values", color = "Meat Type")

The second chart shifts focus to Beef, Pork, and Poultry (Chicken & Turkey). Beef dominates in protein content and iron, highlighting its role as a nutrient powerhouse, though this comes at the cost of higher fat and cholesterol levels. Pork, sitting between beef and poultry, offers a balanced profile, but its variability in sodium levels stands out—some cuts are lean and natural, while others are heavily processed. Poultry emerges as the leanest choice, appealing to those prioritizing low-fat and low-calorie options.

Together, these plots not only provide a nutrient-rich overview of various meat categories but also highlight the trade-offs that come with flavor and processing, helping us appreciate the diverse choices available at the meat counter.

Code
# Stack poultry with beef/pork, keeping the description, the package weight,
# seven nutrient columns and the group label; incomplete rows are dropped.
data_list2 <- list(chicken_origin, beef_Pork_origin)

combined_meat_data_2 <- bind_rows(data_list2) |>
  select(all_of(c(
    "description",
    "packageWeight",
    "Protein G",
    "Total lipid (fat) G",
    "Energy KCAL",
    "Calcium, Ca MG",
    "Iron, Fe MG",
    "Sodium, Na MG",
    "Cholesterol MG",
    "Meat_Type"
  ))) |>
  drop_na()

data_cleaned_2 <- align_weight_scale(combined_meat_data_2, food_type = "solid")

# Keep rows with sodium <= 500 and fat <= 100 after rescaling.
data_filtered_2 <- data_cleaned_2 |>
  filter(
    `Sodium, Na MG` <= 500,
    `Total lipid (fat) G` <= 100
  )

# packageWeight is removed by align_weight_scale(), so column 1 is the
# description, columns 2-8 are the nutrients and column 9 is Meat_Type.
ggparcoord(
  data = data_filtered_2,
  columns = 2:8,        # nutrient variable columns
  groupColumn = 9,      # group (colour) the lines by Meat_Type
  alpha = 0.5,          # line transparency
  scale = "uniminmax",  # rescale each variable before plotting
  title = "Parallel Coordinate Plot with Chicken, Beef and Pork"
) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 18, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 18),
    axis.title.x = element_text(size = 18),
    axis.title.y = element_text(size = 18),
    legend.title = element_text(size = 18),
    legend.text = element_text(size = 18),
    plot.title = element_text(size = 20)
  ) +
  labs(x = "Nutritional Variables", y = "Scaled Values", color = "Meat Type")

3.4 Vegetable

Fiber

As we move into the world of vegetables, we delve into their nutritional essence—fiber. The Cleveland Dot Plot showcases the top 10 fiber-rich vegetables or fruits, highlighting their contributions to dietary fiber per 100 grams. At the pinnacle, we see “Actually Spicy Crunchy Lentils Superfood Snack,” a clear leader in fiber density, followed by other nutrient-packed options like “100% Natural Organic Chia Seeds” and “100% Chunky Minis Avocado.” This visualization not only emphasizes the variety within vegetables but also their potential as high-fiber staples in a balanced diet. It invites us to explore how different vegetables contribute uniquely to our nutritional needs, inspiring informed choices for a healthier lifestyle.

Code
# Food categories treated as vegetables / fruits in this section.
# Spellings currently disabled (kept for reference):
#   "Vegetables - Prepared/Processed", "Vegetables  Prepared/Processed",
#   "Fruit - Prepared/Processed"
selected_categories <- c(
  "Vegetable and Lentil Mixes",
  "Tomatoes",
  "Pre-Packaged Fruit & Vegetables",
  "Frozen Vegetables",
  "Fruit  Prepared/Processed",
  "Fruits  Unprepared/Unprocessed (Shelf Stable)"
)

# Keep the selected categories, drop the fdcId column and rescale the
# remaining numeric columns by package weight.
vege_fruit_data <- latest_data |>
  filter(foodCategory %in% selected_categories) |>
  select(-fdcId) |>
  align_weight_scale("solid")

# Ten products with the highest fibre value after rescaling.
top_fiber_data <- vege_fruit_data |>
  rename(Fiber = `Fiber, total dietary G`) |>
  arrange(desc(Fiber)) |>
  slice_head(n = 10)

# Cleveland dot plot: products on the y-axis ordered by fibre, vertical grid
# lines removed.
ggplot(top_fiber_data, aes(x = Fiber, y = fct_reorder(description, Fiber))) +
  geom_point(color = 'blue') +
  labs(
    title = "Cleveland Dot Plot for Top 10 Fiber-Rich Vegetables or Fruits",
    x = "Fiber g/100g",
    y = ""
  ) +
  theme_linedraw() +
  theme(
    axis.text.y = element_text(size = 10),  # smaller text for long product names
    axis.text.x = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    plot.title = element_text(size = 20),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

3.5 Desserts

As we approach the dessert aisle, the allure of cakes and sweets draws our attention. In this scatter plot, we delve into the relationship between sugar content and calorie count across various cake brands. Each dot represents a brand’s offering, revealing interesting patterns. Brands like “Tastykake” stand out with higher sugar and calorie levels, highlighting their indulgent treats. Meanwhile, brands such as “Schnucks” and “Kroger” show a mix of options, balancing between lower and moderate sugar levels with corresponding calorie ranges. This visual narrative underscores how desserts can vary significantly by brand, providing insight into making informed choices that cater to personal preferences and dietary goals.

Code
# Silence warnings while the dessert scatter plot is built.
options(warn = -1)

# Cakes / cupcakes / snack cakes with complete weight, energy and sugar data.
snack_origin <- latest_data |>
  filter(foodCategory %in% "Cakes, Cupcakes, Snack Cakes") |>
  select(all_of(c(
    "description",
    "packageWeight",
    "brandName",
    "Energy KCAL",
    "Total Sugars G"
  ))) |>
  drop_na()
snack_cleaned <- align_weight_scale(snack_origin, food_type = "solid")

# Number of products per brand.
brand_freq <- table(snack_cleaned$brandName)

# Keep brands that appear more than 3 times.
# NOTE(review): the variable name says "more than two" but the threshold used
# is > 3 (at least four products) - confirm which one is intended.
brands_more_than_two <- names(brand_freq)[brand_freq > 3]

snack <- snack_cleaned |>
  filter(brandName %in% brands_more_than_two)

# Energy on the x-axis, sugars on the y-axis, one colour per brand.
ggplot(snack, aes(x = `Energy KCAL`, y = `Total Sugars G`, color = brandName)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Scatter Plot: Calories vs Sugar",
    x = "Calories (Energy KCAL)",
    y = "Total Sugars (G)",
    color = "Brand Name"
  ) +
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 12),
    plot.title = element_text(size = 20)
  )

Code
# Turn warning reporting back on after it was silenced (warn = -1) for the
# dessert scatter plot.
options(warn = 0)

3.6 Drinks

apple_juice

After indulging in sweet cakes and desserts, it’s time to quench our thirst. We move to the drinks section, where two classic beverages take the spotlight: apple juice and milk. Each sip of these drinks tells a different story about their brand and nutritional content.

First, we explore the vibrant world of apple juice. Picture yourself in an orchard, choosing the juiciest apples. The PCA chart shows how brands like R.W. Knudsen shine with their high vitamin C content, perhaps reminiscent of freshly squeezed juice bursting with nutrients. On the other hand, brands like Old Orchard cluster together, suggesting a more consistent but less adventurous profile. Whether you’re seeking a health boost or just a familiar taste, the choice is yours.

Code
# Silence warnings while the apple-juice biplot is drawn.
options(warn = -1)

# Products whose description contains both "Apple" and "Juice"
# (case-insensitive), limited to the brand, the package size and four
# nutrient columns; incomplete rows are dropped and the numeric columns are
# rescaled by package volume.
apple_juice_data <- latest_data |>
  filter(
    str_detect(description, regex("Apple", ignore_case = TRUE)),
    str_detect(description, regex("Juice", ignore_case = TRUE))
  ) |>
  select(all_of(c(
    "description",
    "brandName",
    "packageWeight",
    "Vitamin C, total ascorbic acid MG",
    "Potassium, K MG",
    "Energy KCAL",
    "Total Sugars G"
  ))) |>
  drop_na() |>
  align_weight_scale("liquid")

# brandName is the only text column left once description is removed.
draw_biplot(select(apple_juice_data, -description))

Code
# Turn warning reporting back on after it was silenced (warn = -1) for the
# apple-juice biplot.
options(warn = 0)

milk

Next, we pour a glass of milk—creamy, rich, and comforting. But not all milk is the same! The second PCA chart reveals how brands differentiate. Kemps, sitting apart from the others, suggests a richer, more indulgent milk, perhaps perfect for those seeking higher energy and nutrient content. Meanwhile, brands like Fairlife and Country Fresh offer a more balanced profile, ideal for everyday consumption.

Each glass tells its own story, inviting us to reflect on what we value most in our drinks—whether it’s indulgence, health, or tradition. And so, our supermarket journey continues, as every aisle reveals another layer of discovery.

Code
# Silence warnings while the milk biplot is drawn.
options(warn = -1)

# "Milk" category products whose description mentions milk but not chocolate
# or strawberry (all case-insensitive), limited to the brand, the package
# size and six nutrient columns; incomplete rows are dropped and the numeric
# columns are rescaled by package volume.
milk_data <- latest_data |>
  filter(
    foodCategory == "Milk",
    str_detect(description, regex("Milk", ignore_case = TRUE)),
    !str_detect(description, regex("CHOCOLATE|STRAWBERRY", ignore_case = TRUE))
  ) |>
  select(all_of(c(
    "description",
    "brandName",
    "packageWeight",
    "Calcium, Ca MG",
    "Total lipid (fat) G",
    "Protein G",
    "Potassium, K MG",
    "Energy KCAL",
    "Total Sugars G"
  ))) |>
  drop_na() |>
  align_weight_scale("liquid")

# brandName is the only text column left once description is removed.
draw_biplot(select(milk_data, -description))

Code
# Turn warning reporting back on after it was silenced (warn = -1) for the
# milk biplot.
options(warn = 0)